* by Kohei Takeda and Atsushi Yamagishi
* This data deals with 1933-1975 population and employment (establishments). 

cap log close
clear all
set more off 
graph set window fontface "Ariel"

ssc install geodist, replace 

* Choose which version of 1945 population to use
* 1945 population data version. 1 = Building damage from 原爆戦災誌 2 = pre-post poulation ratio from 市政要覧. 3: mix of the two.
* The paper adopts method 3, so set subvar=3 
local subvar = 3

import dbase using "../../data/raw data/popemp_33_75_cleaned6.dbf"  


*****************************
* Part 1: Using 1933-75 data
*****************************

* Renaming variables 
rename 旧町名 block
rename 全壊建 tdest_build 
rename 半壊建 halfdamage_build 
rename 小破建 smalldamage_build 
rename 無事建 nodamage_build 
rename 即死率 death_rate 
rename 負傷率 injury_rate
rename 無傷率 noinjury_rate 
rename 被爆数 total_hibaku 
rename 直爆数 total_chokubaku 
rename _45死数 total_death
drop lon lat 
* These coordiantes are calculated in ESPG4612. Centroids of the popemp_33_75_cleaned5.shp
rename xcoord lon
rename ycoord lat
rename area_3 size
* Convert the area size from m^2 to km^2
replace size = size/1000000

* Merge the distance from epicenter. Source = 原爆戦災誌　第一巻
merge 1:1 block using "../../data/raw data/広島原爆デジタルアトラス/distance_epicenter_atlas.dta"
keep if _merge ==3
drop _merge
* Merge information on public housing 1946--1950.
merge 1:1 block using "../../data/raw data/publichouse/pubhouse4650_bytown.dta"
drop _merge
* Add agriculture employment share
merge m:1 出張所 using "../../data/raw data/others/hiroshima1950_agshare.dta"
keep if _merge ==3
drop _merge
* Add controls used for the revised version
merge 1:1 block using "../../data/raw data/others/addedcontrols_oct2024.dta"
keep if _merge ==3
drop _merge


/* Sample selection */

gen heiwakouen = 0
replace heiwakouen = 1 if block=="中島本町"|block=="材木町"|block=="天神町"|block=="元柳町"|block=="木挽町"|block=="中島新町"
* Drop six blocks that later contitute heiwa kinen kouen
drop if heiwakouen == 1

* Drop two exceptional obs with no pre-bombing population data:
drop if block == "比治山公園"|block == "基町"


* If these rate are all zero, it means missing values for damages
gen missingdamage_human = (death_rate+injury_rate+noinjury_rate==0)
replace death_rate =. if missingdamage_human == 1
replace injury_rate =. if missingdamage_human == 1
replace noinjury_rate =. if missingdamage_human == 1
replace total_hibaku =. if missingdamage_human == 1
replace total_chokubaku =. if missingdamage_human == 1
replace total_death =. if missingdamage_human == 1

* No missing record for building damage.
gen missingdamage_build = 0
replace missingdamage_build = 1 if tdest_build==.

* Calculating population density for each year 
foreach t in 33 34 35 36 49 50 51 52 53 55 60 65 70 75{
  gen popdens`t' = pop`t'/size
  gen l_popdens`t' = ln(pop`t'/size) 
}

* Density of stores and factories in 1938.
gen stordens38 = stor38/size
gen l_stordens38 = ln(stordens38)
gen factdens38 = fact38/size
gen l_factdens38 = ln(factdens38)
* Sum of factories (工業) and stores (商業) = establishments
gen est38 = stor38 + fact38

** Drop two blocks in which employment 38 is zero (no missing value)
gen est38_zero = (est38==0)
gen est38_missing = (est38==.)
drop if est38==0|est38==.

* Large lower bound theta. The employment data was ambiguous for 白島北町.　Yasuda gakuen was there (yasuda elementary school opened in 1956) https://ja.wikipedia.org/wiki/%E7%99%BD%E5%B3%B6%E5%8C%97%E7%94%BA
* 鳥屋町 has extremely large Rthetalb_69. 鳥屋町 is right next to 基町 and 原爆ドーム. Something special? (Small park? https://www.hiroshimapeacemedia.jp/?p=111365)
* In 石見屋町, エリザベト音楽大学 opened in 1963, which presumably reduced population during 1960--1965
drop if block=="白島北町"|block=="鳥屋町"|block=="石見屋町"
* Other blocks with high theta (for L, 1945)
*drop if block=="白島九軒町"|block=="吉島町"|block=="段原東浦町"|block=="段原末広町"


drop if missingdamage_build==1
* CBD = midpoint of 八丁堀 and 紙屋町
gen temp_lat = lat if block == "八丁堀"
gen temp_lon = lon if block == "八丁堀"
replace temp_lat = lat if block == "紙屋町"
replace temp_lon = lon if block == "紙屋町"
replace temp_lat = 0 if temp_lat == . 
replace temp_lon = 0 if temp_lon ==. 

egen cbd_lat = total(temp_lat)
replace cbd_lat = cbd_lat/2
egen cbd_lon = total(temp_lon) 
replace cbd_lon = cbd_lon/2

drop temp_lat temp_lon

geodist cbd_lat cbd_lon lat lon, generate(cbd_dist)
drop if cbd_dist > 6
drop cbd_dist cbd_lat cbd_lon

/* Sample selection ends. 197 blocks -> 174 blocks */ 

* Density of establishments.
foreach t in 38 57 63 66 75{
  gen estdens`t' = est`t'/size
  gen l_estdens`t' = ln(estdens`t')
}

** Adjustment for employment 53 (exclude agriculture workers)
* 0.5 is the labor participation rate among farm households
replace emp53 = emp53 - pop53*agshare*0.5

* Density of employment. (1938 is defined later)
foreach t in 53 57 63 66 75{
  gen empdens`t' = emp`t'/size
  gen l_empdens`t' = ln(empdens`t')
}


*****
* How to calculate the 1945 population (i.e., effect of the war)

** Method 1. Use the damage to buildings from 原爆戦災誌
gen surviverate = 0.01 
replace surviverate = 1 - (tdest_build /100) if 100 - tdest_build >= 1

** Method 2. Use the distance to epicenter and the population in Nov 1 data from 広島市政要覧 (S22, p42)

gen surviverate2 = 1455/60894
replace surviverate2 = 5925 / 65578 if distance_epicenter == 3
replace surviverate2 = 11472 / 64954 if distance_epicenter == 4
replace surviverate2 = 31072 / 52150 if distance_epicenter == 5
replace surviverate2 = 27458 / 27168 if distance_epicenter == 6
replace surviverate2 = 59136 / 41523 if distance_epicenter >6

* Aggregate more than 3km, within 1km
gen distance_epicenter2 = distance_epicenter
replace distance_epicenter2 = 7 if distance_epicenter>6
replace distance_epicenter2=1 if distance_epicenter < 3


** Method 3: Get coefficients for predict the 1945 population from building damage
gen pop36_r = round(pop36, 1)

frame copy default new
frame new{

collapse (mean) surviverate2 tdest_build halfdamage_build [fw = pop36_r], by(distance_epicenter2) 

gen tdest_build2 = tdest_build^2

reg surviverate2 tdest_build tdest_build2
gen tdest_0 = _b[_cons]
gen tdest_1 = _b[tdest_build]
gen tdest_2 = _b[tdest_build2]
** Figure A4(a) in paper
#d;
twoway (scatter surviverate2 tdest_build, sort color(black) msize(small)) 
(qfit surviverate2 tdest_build, sort color(cranberry) lwidth(medthick) range(0 100)),
	scheme(s1color) plotregion(lwidth(none) ilwidth(none))
   legend(off)
	xtitle("Percentage of totally destroyed buildings")
	ytitle("Population change rate due to the bombing") ylabel(, angle(horizontal));
	graph export "../../output/figure/Popchange_bomb_tdest.pdf", as(pdf)   replace ;
#d cr
}

* _frval() returns values of variables stored in other frames. It returns var's ith observation (here, 1st)

gen tdest_0 = _frval(new, tdest_0, 1)
gen tdest_1 = _frval(new, tdest_1, 1)
gen tdest_2 = _frval(new, tdest_2, 1)


gen tdest_build2 = tdest_build^2


if `subvar' == 1{

gen pop45bomb = pop36*surviverate
gen popdens45bomb = pop45bomb / size
gen l_popdens45bomb = ln(popdens45bomb)

*gen est45bomb = est38*surviverate
*gen estdens45bomb = est45bomb / size
*gen l_estdens45bomb = ln(estdens45bomb)
}

if `subvar' == 2{

gen pop45bomb = pop36*surviverate2
gen popdens45bomb = pop45bomb / size
gen l_popdens45bomb = ln(popdens45bomb)
* Although this survive rate is based on population, use this for establishments too
*gen est45bomb = est38*surviverate2
*gen estdens45bomb = est45bomb / size
*gen l_estdens45bomb = ln(estdens45bomb)
}

if `subvar' == 3{
gen surviverate3 = tdest_1*tdest_build + tdest_2*tdest_build2 + tdest_0
gen pop45bomb = pop36*surviverate3
gen popdens45bomb = pop45bomb / size
gen l_popdens45bomb = ln(popdens45bomb)
* Although this survive rate is based on population, use this for establishments too
*gen est45bomb = est38*surviverate3
*gen estdens45bomb = est45bomb / size
*gen l_estdens45bomb = ln(estdens45bomb)
* In this case, obs is not meaningful if building damage is missingdamage_build
drop if missingdamage_build==1
}

** Merge with the validation data, school district population from 1946 shisei-youran
merge 1:1 block using "../../data/raw data/Hiroshima_Shisei_Yoran_Sengo/pop194546_schooldistrict.dta"	
drop _merge 

** Validation of the surviverate3, which is the imputated population change due to the a-bomb.
* ratio is the population change ratio by school districts, taken from S21 広島市勢要覧

reg ratio surviverate3
corr ratio surviverate3

reg surviverate3 ratio
** Figure A5 in paper
#d;
twoway (scatter ratio surviverate3, sort color(black) msize(small)) 
(lfit ratio surviverate3, sort color(cranberry) lwidth(medthick)),
	scheme(s1color) plotregion(lwidth(none) ilwidth(none))
    legend(off)
	xtitle("Imputed population change rate (as of November 1945)")
	ytitle("Population change rate (as of August 1946)") ylabel(, angle(horizontal));
	graph export "../../output/figure/validation_pop1945.pdf", as(pdf)   replace ;
#d cr


* The ratio of recovery from the initial population: 1951/1945 or 1960.1945
* 基町 has exceptionally high recov rate. Exclude this when using recov in the analysis.
gen recov5145 = pop51/pop45bomb
gen recov6045 = pop60/pop45bomb

* To construct a placebo, get the population change rate from 1934--36 (pre-trend).
* Take the geometric averate to minimize idiosyncracy. pretrend is a 1-year growth rate.
gen pretrend1 = pop34/pop33
gen pretrend2 = pop35/pop34
gen pretrend3 = pop36/pop35
gen pretrend = (pretrend1 * pretrend2 * pretrend3)^(1/3)

* How to deal with extreme growth at the block level?
* Censor if 4 times growth in 15 years. (1.1^15 >4)
* Same for low trend
replace pretrend = 1.1 if pretrend>1.1
replace pretrend = 0.9 if pretrend < 0.9

gen pretrend_q = pretrend^2
gen pretrend_c = pretrend^3

* High density means lower growth rate, so even the fast-growing areas would stop growing in the end.
* But linear pre-trend assumption means that this is not taken into account.
reg pretrend l_popdens33

** Population in 1950, 1951, 1960, 1975 predicted by the pre-trend.
gen pop50_prewartrend = pop36*(pretrend^14)
gen pop51_prewartrend = pop36*(pretrend^15)
gen pop60_prewartrend = pop36*(pretrend^24)
gen pop75_prewartrend = pop36*(pretrend^39)

label var block "Name"
label var tdest_build "Rate of total destroy Building"
label var halfdamage_build "Rate of half damage buildings"
label var smalldamage_build "Rate of small damage buildings"
label var nodamage_build "Rate of no damage building"
label var death_rate "Rate of immediate death" 
label var injury_rate "Rate of injury" 


* CBD = midpoint of 八丁堀 and 紙屋町
gen temp_lat = lat if block == "八丁堀"
gen temp_lon = lon if block == "八丁堀"
replace temp_lat = lat if block == "紙屋町"
replace temp_lon = lon if block == "紙屋町"
replace temp_lat = 0 if temp_lat == . 
replace temp_lon = 0 if temp_lon ==. 

egen cbd_lat = total(temp_lat)
replace cbd_lat = cbd_lat/2
egen cbd_lon = total(temp_lon) 
replace cbd_lon = cbd_lon/2

drop temp_lat temp_lon

* Compute distance to CBD
geodist cbd_lat cbd_lon lat lon, generate(cbd_dist)
label var cbd_dist "Distance to CBD, km"

* Compute distance to the epicenter. https://www.chugoku-np.co.jp/articles/-/51892
geodist 34.394593542 132.454797056 lat lon, generate(epi_dist)
label var epi_dist "Distance to the epicenter, km"

* Compute distance to Ujina port. Coordinates taken from https://ja.wikipedia.org/wiki/%E5%BA%83%E5%B3%B6%E6%B8%AF
geodist 34.352167 132.455119 lat lon, generate(port_dist)
label var port_dist "Distance to Ujina port, km"

* Focus within 6km to eliminate outliers (似島, 草津南町）
drop if cbd_dist>6
* create GEOCODE 
egen geocode = group(block) 
* Correspondence between geocode and blockname
export delimited block geocode using "../../data/processed data/geocode_blockname.csv", replace
** Based on the distance to CBD, impute the number of establishments in November 1945. 

frame create new2
frame change new2

import excel "../../data/raw data/Hiroshima_Shisei_Yoran_Sengo/buildingbydistance_1946_changerate.xlsx", sheet("Sheet1") firstrow

gen mdistance2 = mdistance^2
gen mdistance3 = mdistance^3

reg 変化率45NOV mdistance mdistance2 mdistance3
predict yhat
gen ch45_0 = _b[_cons]
gen ch45_1 = _b[mdistance]
gen ch45_2 = _b[mdistance2]
gen ch45_3 = _b[mdistance3]

frame change default

* _frval() returns values of variables stored in other frames. It returns var's ith observation (here, 1st)

gen ch45_0 = _frval(new2, ch45_0, 1)
gen ch45_1 = _frval(new2, ch45_1, 1)
gen ch45_2 = _frval(new2, ch45_2, 1)
gen ch45_3 = _frval(new2, ch45_3, 1)

gen epi_dist2 = epi_dist^2
gen epi_dist3 = epi_dist^3

** Case 1 of employment imputation in 1945. Use distance from the epicenter.

*gen changerate_est45 = ch45_1*epi_dist + ch45_2*epi_dist2 + ch45_3*epi_dist3 + ch45_0
* To avoid negative change rate, replace with the destruction rate at 250m point and 4.75km points for extreme points (ie avoid extrapolation)
* Also, explicitly avoid the negative predicted value
*replace changerate_est45 = ch45_1*0.25 + ch45_2*(0.25^2) + ch45_3*(0.25^3) + ch45_0 if changerate_est45 < 0 | epi_dist<0.25
*replace changerate_est45 = ch45_1*4.75 + ch45_2*(4.75^2) + ch45_3*(4.75^3) + ch45_0 if epi_dist > 4.75

** Case 2 of employment imputation in 1945. Use the fraction of totally destructed buildings.
* For coefficients, see buildingbydistance_1946.xlsx (sheet 3)
* For the 100% destruction, the predicted value is slightly negative. Use the change rate at 99% destruction

gen changerate_est45 = -0.0044653*tdest_build - 0.0000611*(tdest_build^2) + 1.048608
replace changerate_est45 = -0.0044653*99 - 0.0000611*(99^2) + 1.048608 if changerate_est45<0

gen est45bomb = est38*changerate_est45
gen estdens45bomb = est45bomb / size
gen l_estdens45bomb = ln(estdens45bomb)


** Convert establishment info into employment. 

* This definition of employment 36: rescale the establishment distribution to the population.
egen totalest38 = total(est38)
egen totalpop36 = total(pop36)
* Note: This means that everyone is assumed to work in 1938.
* Total population in 1938 is taken from http://demography.blog.fc2.com/blog-entry-371.html
gen emp38 = 384906 * est38 / totalest38
gen empdens38 = emp38 / size
gen l_empdens38 = ln(empdens38)

* employment distribution (establishment distribution scaled by the population)
egen totalest45 = total(est45bomb)
egen totalpop45 = total(pop45bomb)
gen emp45bomb = totalpop45 * est45bomb / totalest45
gen empdens45bomb = emp45bomb/size
gen l_empdens45bomb = ln(empdens45bomb)

* Now, the total of emp38, 45 is the total population (not the total workforce)
* From S11 Hiroshima-shi toukei sho, we calculate the workforce ratio 44.2% in 1936
replace emp38 = emp38 * 0.442
replace emp45bomb = emp45bomb * 0.442



** Save files. Note that the file is saved before rescaling employment data after WW2. 
save "../../data/processed data/1933_75_popemp.dta", replace 

* Generate intermadiate years data: 1957, 1963, 1969 population, 1969 employment.
gen pop57 = 0.6*pop55 + 0.4*pop60
gen pop63 = 0.4*pop60 + 0.6*pop65
gen pop69 = 0.2*pop65 + 0.8*pop70
gen emp69 = (2/3)*emp66 + (1/3)*emp75

* 1953
egen totalemp53 = total(emp53)
egen totalpop53 = total(pop53)
replace emp53 = totalpop53 * emp53 / totalemp53

* 1957
egen totalemp57 = total(emp57) 
egen totalpop55 = total(pop55)
egen totalpop60 = total(pop60)
gen totalpop57 = totalpop55 + 0.4 * (totalpop60 - totalpop55)
replace emp57 = totalpop57 * emp57 / totalemp57


* 1963
egen totalemp63 = total(emp63)
egen totalpop65 = total(pop65)
gen totalpop63 = totalpop60 + 0.6 * (totalpop65 - totalpop60)
replace emp63 = totalpop63 * emp63 / totalemp63

* 1966
egen totalemp66 = total(emp66)
egen totalpop70 = total(pop70)
gen totalpop66 = totalpop65 + 0.2 * (totalpop70 - totalpop65)
replace emp66 = totalpop66 * emp66 / totalemp66


* 1975
egen totalemp75 = total(emp75)
egen totalpop75 = total(pop75)
replace emp75 = totalpop75 * emp75 / totalemp75

* For 1951 employment, scale back 1953 employment a bit.
egen totalpop51 = total(pop51)
* Note that total(emp53)=total(pop53) after our normalization.
gen emp51 = emp53 * totalpop51 / totalpop53


* Total size adjustment for the employment 
* 1969
egen totalemp69 = total(emp69)
gen totalpop69 = 0.2*totalpop65 + 0.8*totalpop70
replace emp69 = totalpop69 * emp69 / totalemp69


** Compute the lowerbound. Separately for R and L

gen Rthetalb_51 = 1 - pop51/pop45bomb
gen Rthetalb_57 = 1 - pop57/pop51
gen Rthetalb_63 = 1 - pop63/pop57
gen Rthetalb_69 = 1 - pop69/pop63
gen Rthetalb_75 = 1 - pop75/pop69

sum Rthetalb_51, d
sum Rthetalb_57, d
sum Rthetalb_63, d
sum Rthetalb_69, d
sum Rthetalb_75, d

gen Lthetalb_51 = 1 - emp51/emp45bomb
gen Lthetalb_57 = 1 - emp57/emp51
gen Lthetalb_63 = 1 - emp63/emp57
gen Lthetalb_69 = 1 - emp69/emp63
gen Lthetalb_75 = 1 - emp75/emp69

sum Lthetalb_51, d
sum Lthetalb_57, d
sum Lthetalb_63, d
sum Lthetalb_69, d
sum Lthetalb_75, d
